Get packages:

# General packages for stuff
library(tidyverse)
library(here)
library(janitor)
library(plotly)

# Packages for spatial stuff & point pattern analysis
library(tmap)
library(sf)
library(spatstat)
library(maptools)
library(sp)
library(raster)

# Packages for cluster analysis:
library(NbClust)
library(cluster)
library(factoextra)
library(dendextend)
library(ggdendro)

Get data:

voles <- read_sf(dsn = here("data","redtreevoledata"), 
                 layer = "ds033") %>% 
  dplyr::select(COUNTY) %>% 
  filter(COUNTY == "HUM") %>% 
  st_transform(crs = 4326)

# plot(voles)

# Get Humboldt County outline
humboldt <- read_sf(dsn = here("data","redtreevoledata"), 
                    layer = "california_county_shape_file") %>% 
  filter(NAME == "Humboldt") %>% 
  dplyr::select(NAME)

st_crs(humboldt) <- 4326

# plot(humboldt)

# Plot them together: 
tm_shape(humboldt) +
  tm_fill() +
  tm_shape(voles) +
  tm_dots(size = 0.2)

# Or with ggplot2: 
ggplot() +
  geom_sf(data = humboldt) +
  geom_sf(data = voles)

# Or save it:
ggsave(here("figures","humvoles.png"),
       units = "in", 
       width = 4, 
       height = 6, 
       dpi = 300)

# Another example (with tiff...there's also jpeg, png, etc.)

# tiff("humvoles2.tiff", units = "in", width = 5, height = 5, res = 300)

# Customize it however you want: 
ggplot() +
  geom_sf(data = humboldt, fill = "black") +
  geom_sf(data = voles, color = "red", alpha = 0.5)

# dev.off()

We want to explore point patterns in a few different ways. Quadrat analysis, nearest neighbor analysis, etc. to compare with.

First we need to convert to ‘ppp’ and ‘owin’ - the points and windows, as used by maptools and spatstat (because sf is still catching up for raster and point pattern analysis stuff)…

## UPDATE!
voles_sp <- as(voles,"Spatial")
voles_ppp <- as(voles_sp, "ppp")

# Breaks in 244 Winter 2020
# Projection issue - switch to 6345 above and this works KIND OF -- but seems problematic. Need to fix for 244 Winter 2021!!! 

humboldt_sp <- as(humboldt, "Spatial")
humboldt_win <- as(humboldt_sp, "owin")

voles_pb <- ppp(voles_ppp$x, voles_ppp$y, window = humboldt_win)

plot(voles_pb)

vole_qt <- quadrat.test(voles_pb, nx = 5, ny = 10) # nx and ny are number of columns/rows for the rectangles created 

# Returns: VoleQT
# Chi-squared test of CSR using quadrat counts

# data:  VolePPP 
# X-squared = 425.94, df = 45, p-value < 2.2e-16
# alternative hypothesis: two.sided 
# Reject the null hypothesis of spatial evenness! But we still don't know if more clustered or more uniform...

plot(voles_pb)
plot(vole_qt, add = TRUE, cex = 0.4)

Plot densities:

point_density <- density(voles_pb, sigma = 0.02)
plot(point_density)

# Can you start viewing this in tmap? Yes, rasterize it: 
wgs84 = "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
vole_raster <- raster(point_density, crs = wgs84)

# Then plot: 
tm_shape(vole_raster) +
  tm_raster(midpoint = NA, 
            palette = "Blues", 
            legend.show = FALSE)

Nearest neighbor (G-function)

r <- seq(0,0.15, by = 0.005)

gfunction <- envelope(voles_pb, fun = Gest, r = r, nsim = 100, nrank = 2) # Sig level of Monte Carlo = 0.04
## Generating 100 simulations of CSR  ...
## 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
## 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
## 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,  100.
## 
## Done.
plot(gfunction$obs ~ gfunction$r, type = "l", col = "black", lty = 11)
lines(gfunction$hi ~ gfunction$r, type = "l", col = "blue", lty = 8)
lines(gfunction$theo ~ gfunction$r, type = "l", col = "red", lty = 6)
lines(gfunction$lo ~ gfunction$r, type = "l", col = "green", lty = 4)

# Confirms, in combination with quadrat.test, clustered data!

Nearest Neighbor by Ripley’s K (using L standardization)

r2 <- seq(0,0.5, by = 0.05)

lfunction <- envelope(voles_pb, fun = Lest, r = r2, nsim = 20, rank = 2, global = TRUE)
## Generating 20 simulations of CSR  ...
## 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,  20.
## 
## Done.
plot(lfunction$obs ~ lfunction$r, type = "l", col = "black", lty = 11)
lines(lfunction$hi ~ lfunction$r, type = "l", col = "blue", lty = 8)
lines(lfunction$theo ~ lfunction$r, type = "l", col = "red", lty = 6)
lines(lfunction$lo ~ lfunction$r, type = "l", col = "green", lty = 4)

Diggle-Cressie-Loosmore-Ford test of CSR

DCLFTest <- dclf.test(voles_pb, nsim = 100, rank = 2) 
## Generating 100 simulations of CSR  ...
## 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
## 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
## 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,  100.
## 
## Done.
DCLFTest
## 
##  Diggle-Cressie-Loosmore-Ford test of CSR
##  Monte Carlo test based on 100 simulations
##  Summary function: K(r)
##  Reference function: theoretical
##  Alternative: two.sided
##  Interval of distance values: [0, 0.250888]
##  Test statistic: Integral of squared absolute deviation
##  Deviation = observed minus theoretical
## 
## data:  voles_pb
## u = 0.001952, rank = 1, p-value = 0.009901

Intro to cluster analysis (k-means, hierarchical)

k-means (partition-based)

Part 1. K-means clustering:

Nevermind…back to irises dataset?

iris_nice <- iris %>% 
  clean_names()

ggplot(iris_nice) +
  geom_point(aes(x = petal_length, y = petal_width, color = species))

# How many clusters do you THINK there should be? 
number_est <- NbClust(iris_nice[1:4], min.nc = 2, max.nc = 10, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 11 proposed 2 as the best number of clusters 
## * 11 proposed 3 as the best number of clusters 
## * 1 proposed 8 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
# By these estimators, 2 is the best number of clusters...but should that change our mind? Maybe...

# What if we consider similarities across all four variables? 

iris_km <- kmeans(iris_nice[1:4], 3) # kmeans specifying 3 groups!

iris_km$size
## [1] 33 21 96
iris_km$centers
##   sepal_length sepal_width petal_length petal_width
## 1     5.175758    3.624242     1.472727   0.2727273
## 2     4.738095    2.904762     1.790476   0.3523810
## 3     6.314583    2.895833     4.973958   1.7031250
iris_km$cluster
##   [1] 1 2 2 2 1 1 1 1 2 2 1 1 2 2 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 2 2 1 1 1 2 1 1
##  [38] 1 2 1 1 2 2 1 1 2 1 2 1 1 3 3 3 3 3 3 3 2 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [75] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3
## [112] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [149] 3 3
# Bind the cluster number to the original data

iris_cl <- data.frame(iris_nice, cluster_no = factor(iris_km$cluster))

ggplot(iris_cl) +
  geom_point(aes(x = sepal_length, y = sepal_width, color = cluster_no))

A little better…

ggplot(iris_cl) +
  geom_point(aes(x = petal_length, 
                 y = petal_width, 
                 color = cluster_no, 
                 pch = species)) +
  scale_color_brewer(palette = "Set2")

Make it 3D with plot_ly()…

# Or, a 3D plot with plotly

plot_ly(x = iris_cl$petal_length, 
        y = iris_cl$petal_width, 
        z = iris_cl$sepal_width, 
        type = "scatter3d", 
        color = iris_cl$cluster_no, 
        symbol = ~iris_cl$species,
        marker = list(size = 3),
        colors = "Set1")

####Part 2. Cluster analysis: hierarchical

Hierarchical cluster analysis (dendrograms) in R

Relevant functions:

stats::hclust() - agglomerative hierarchical clustering cluster::diana() - divisive hierarchical clustering

We’ll be using WorldBank environmental data (simplified), wb_env.csv

# Get the data
wb_env <- read_csv(here("data", "wb_env.csv"))

# Only keep top 20 greenhouse gas emitters (for simplifying visualization here...)
wb_ghg_20 <- wb_env %>% 
  arrange(-ghg) %>% 
  head(20)

# Scale it (can consider this for k-means clustering, too...)
wb_scaled <- as.data.frame(scale(wb_ghg_20[3:7]))

# Update to add rownames (country name)
rownames(wb_scaled) <- wb_ghg_20$name

# Compute dissimilarity values (Euclidean distances):
diss <- dist(wb_scaled, method = "euclidean")

# Hierarchical clustering (complete linkage)
hc_complete <- hclust(diss, method = "complete" )

# Plot it (base plot):
plot(hc_complete, cex = 0.6, hang = -1)

Divisive clustering:

hc_div <- diana(diss)

plot(hc_div, hang = -1)

rect.hclust(hc_div, k = 4, border = 2:5)

We might want to compare those…because they differ slightly.

# Convert to class dendrogram
dend1 <- as.dendrogram(hc_complete)
dend2 <- as.dendrogram(hc_div)

# Combine into list
dend_list <- dendlist(dend1,dend2)

# Make a tanglegram
tanglegram(dend1, dend2)

# Convert to class 'dendro' for ggplotting
data1 <- dendro_data(hc_complete)

# Simple plot with ggdendrogram
ggdendrogram(hc_complete, 
             rotate = TRUE) +
  theme_minimal() +
  labs(x = "Country")

# Want to do it actually in ggplot? Here: 
label_data <- bind_cols(filter(segment(data1), x == xend & x%%1 == 0), label(data1))

ggplot() + 
geom_segment(data=segment(data1), aes(x=x, y=y, xend=xend, yend=yend)) +
geom_text(data=label_data, aes(x=xend, y=yend, label=label, hjust=0), size=2) +
coord_flip() + 
scale_y_reverse(expand=c(0.2, 0)) +
theme_bw() +
theme(panel.border = element_blank(),
      panel.grid.major = element_blank(), 
      panel.grid.minor = element_blank(),
      axis.line = element_blank(),
      axis.title = element_blank(),
      axis.text = element_blank(),
      axis.ticks = element_blank(),
      legend.position = "None")